{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bfbbfc2c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: http://mirrors.aliyun.com/pypi/simple/\n",
      "Collecting mindnlp==0.4.0\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/0f/a8/5a072852d28a51417b5e330b32e6ae5f26b491ef01a15ba968e77f785e69/mindnlp-0.4.0-py3-none-any.whl (8.4 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.4/8.4 MB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: mindspore>=2.2.14 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindnlp==0.4.0) (2.3.0)\n",
      "Requirement already satisfied: tqdm in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindnlp==0.4.0) (4.65.0)\n",
      "Requirement already satisfied: requests in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindnlp==0.4.0) (2.31.0)\n",
      "Collecting datasets (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/ed/a5/33cf000137545a08b0a3a6ea76c8ccbd87917f78bb5d737f9f56f3b11ef6/datasets-3.1.0-py3-none-any.whl (480 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting evaluate (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/a2/e7/cbca9e2d2590eb9b5aa8f7ebabe1beb1498f9462d2ecede5c9fd9735faaf/evaluate-0.4.3-py3-none-any.whl (84 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.0/84.0 kB\u001b[0m \u001b[31m20.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting tokenizers==0.19.1 (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/ba/26/139bd2371228a0e203da7b3e3eddcb02f45b2b7edd91df00e342e4b55e13/tokenizers-0.19.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (3.6 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting safetensors (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/08/94/7760694760f1e5001bd62c93155b8b7ccb652d1f4d0161d1e72b5bf9581a/safetensors-0.4.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (442 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m442.4/442.4 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: sentencepiece in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindnlp==0.4.0) (0.1.99)\n",
      "Requirement already satisfied: regex in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindnlp==0.4.0) (2023.10.3)\n",
      "Collecting addict (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl (3.8 kB)\n",
      "Requirement already satisfied: ml-dtypes in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindnlp==0.4.0) (0.2.0)\n",
      "Collecting pyctcdecode (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/a5/8a/93e2118411ae5e861d4f4ce65578c62e85d0f1d9cb389bd63bd57130604e/pyctcdecode-0.5.0-py2.py3-none-any.whl (39 kB)\n",
      "Requirement already satisfied: jieba in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindnlp==0.4.0) (0.42.1)\n",
      "Collecting pytest==7.2.0 (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/67/68/a5eb36c3a8540594b6035e6cdae40c1ef1b6a2bfacbecc3d1a544583c078/pytest-7.2.0-py3-none-any.whl (316 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m316.8/316.8 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting pillow>=10.0.0 (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/78/66/7c5e44ab2c0123710a5d4692a4ee5931ac438efd7730ac395e305902346e/pillow-11.0.0-cp39-cp39-manylinux_2_28_aarch64.whl (4.2 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.2/4.2 MB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: attrs>=19.2.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from pytest==7.2.0->mindnlp==0.4.0) (23.1.0)\n",
      "Collecting iniconfig (from pytest==7.2.0->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl (5.9 kB)\n",
      "Requirement already satisfied: packaging in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from pytest==7.2.0->mindnlp==0.4.0) (23.2)\n",
      "Collecting pluggy<2.0,>=0.12 (from pytest==7.2.0->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl (20 kB)\n",
      "Requirement already satisfied: exceptiongroup>=1.0.0rc8 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from pytest==7.2.0->mindnlp==0.4.0) (1.1.3)\n",
      "Collecting tomli>=1.0.0 (from pytest==7.2.0->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/de/f7/4da0ffe1892122c9ea096c57f64c2753ae5dd3ce85488802d11b0992cc6d/tomli-2.1.0-py3-none-any.whl (13 kB)\n",
      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from tokenizers==0.19.1->mindnlp==0.4.0) (0.18.0)\n",
      "Requirement already satisfied: numpy>=1.17.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore>=2.2.14->mindnlp==0.4.0) (1.23.5)\n",
      "Requirement already satisfied: protobuf>=3.13.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore>=2.2.14->mindnlp==0.4.0) (3.20.3)\n",
      "Requirement already satisfied: asttokens>=2.0.4 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore>=2.2.14->mindnlp==0.4.0) (2.4.1)\n",
      "Requirement already satisfied: scipy>=1.5.4 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore>=2.2.14->mindnlp==0.4.0) (1.11.3)\n",
      "Requirement already satisfied: psutil>=5.6.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore>=2.2.14->mindnlp==0.4.0) (5.9.5)\n",
      "Requirement already satisfied: astunparse>=1.6.3 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore>=2.2.14->mindnlp==0.4.0) (1.6.3)\n",
      "Requirement already satisfied: filelock in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from datasets->mindnlp==0.4.0) (3.13.1)\n",
      "Collecting pyarrow>=15.0.0 (from datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/fb/40/4c27579387917f55ce55f136fb20ce53bfe8c9809c2e2d864e5ec044330a/pyarrow-18.0.0-cp39-cp39-manylinux_2_28_aarch64.whl (38.6 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.6/38.6 MB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting dill<0.3.9,>=0.3.0 (from datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl (116 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: pandas in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from datasets->mindnlp==0.4.0) (2.1.2)\n",
      "Collecting requests (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl (64 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m64.9/64.9 kB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting tqdm (from mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/2b/78/57043611a16c655c8350b4c01b8d6abfb38cc2acb475238b62c2146186d7/tqdm-4.67.0-py3-none-any.whl (78 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting xxhash (from datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/b4/92/9ac297e3487818f429bcf369c1c6a097edf5b56ed6fc1feff4c1882e87ef/xxhash-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (220 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m220.6/220.6 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting multiprocess<0.70.17 (from datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl (133 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m133.4/133.4 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets->mindnlp==0.4.0) (2023.10.0)\n",
      "Collecting aiohttp (from datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/f6/db/330bc465d22c8d1c5be4d7fc370ac4c11385084dbb3ca3eaf353dd404565/aiohttp-3.11.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (1.6 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting huggingface-hub<1.0,>=0.16.4 (from tokenizers==0.19.1->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/60/bf/cea0b9720c32fa01b0c4ec4b16b9f4ae34ca106b202ebbae9f03ab98cd8f/huggingface_hub-0.26.2-py3-none-any.whl (447 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m447.5/447.5 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from datasets->mindnlp==0.4.0) (6.0.1)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from requests->mindnlp==0.4.0) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from requests->mindnlp==0.4.0) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from requests->mindnlp==0.4.0) (2.0.7)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from requests->mindnlp==0.4.0) (2023.7.22)\n",
      "Collecting pygtrie<3.0,>=2.1 (from pyctcdecode->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/ec/cd/bd196b2cf014afb1009de8b0f05ecd54011d881944e62763f3c1b1e8ef37/pygtrie-2.5.0-py3-none-any.whl (25 kB)\n",
      "Collecting hypothesis<7,>=6.14 (from pyctcdecode->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/36/1a/b56f2d3bc2c0e129634891b3f0000cf07e96db5c4a460b7cf2351d19081e/hypothesis-6.118.8-py3-none-any.whl (471 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m471.8/471.8 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: six>=1.12.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from asttokens>=2.0.4->mindspore>=2.2.14->mindnlp==0.4.0) (1.16.0)\n",
      "Requirement already satisfied: wheel<1.0,>=0.23.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from astunparse>=1.6.3->mindspore>=2.2.14->mindnlp==0.4.0) (0.41.3)\n",
      "Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/f7/d8/120cd0fe3e8530df0539e71ba9683eade12cae103dd7543e50d15f737917/aiohappyeyeballs-2.4.3-py3-none-any.whl (14 kB)\n",
      "Collecting aiosignal>=1.1.2 (from aiohttp->datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/76/ac/a7305707cb852b7e16ff80eaf5692309bde30e2b1100a1fcacdc8f731d97/aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
      "Collecting frozenlist>=1.1.1 (from aiohttp->datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/08/04/e2fddc92135276e07addbc1cf413acffa0c2d848b3e54cacf684e146df49/frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (241 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m241.8/241.8 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting multidict<7.0,>=4.5 (from aiohttp->datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/89/87/d451d45aab9e422cb0fb2f7720c31a4c1d3012c740483c37f642eba568fb/multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (126 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m126.2/126.2 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting propcache>=0.2.0 (from aiohttp->datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/e2/dc/60d444610bc5b1d7a758534f58362b1bcee736a785473f8a39c91f05aad1/propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (211 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.1/211.1 kB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting yarl<2.0,>=1.17.0 (from aiohttp->datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/12/5d/8bd30a5d2269b0f4062ce10804c79c2bdffde6be4c0501d1761ee99e9bc7/yarl-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (316 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m316.1/316.1 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting async-timeout<6.0,>=4.0 (from aiohttp->datasets->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl (6.2 kB)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers==0.19.1->mindnlp==0.4.0) (4.8.0)\n",
      "Collecting sortedcontainers<3.0.0,>=2.1.0 (from hypothesis<7,>=6.14->pyctcdecode->mindnlp==0.4.0)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from pandas->datasets->mindnlp==0.4.0) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from pandas->datasets->mindnlp==0.4.0) (2023.3.post1)\n",
      "Requirement already satisfied: tzdata>=2022.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from pandas->datasets->mindnlp==0.4.0) (2023.3)\n",
      "\u001b[33mDEPRECATION: moxing-framework 2.1.16.2ae09d45 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of moxing-framework or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n",
      "\u001b[0mInstalling collected packages: sortedcontainers, pygtrie, addict, xxhash, tqdm, tomli, safetensors, requests, pyarrow, propcache, pluggy, pillow, multidict, iniconfig, hypothesis, frozenlist, dill, async-timeout, aiohappyeyeballs, yarl, pytest, pyctcdecode, multiprocess, huggingface-hub, aiosignal, tokenizers, aiohttp, datasets, evaluate, mindnlp\n",
      "  Attempting uninstall: tqdm\n",
      "    Found existing installation: tqdm 4.65.0\n",
      "    Uninstalling tqdm-4.65.0:\n",
      "      Successfully uninstalled tqdm-4.65.0\n",
      "  Attempting uninstall: requests\n",
      "    Found existing installation: requests 2.31.0\n",
      "    Uninstalling requests-2.31.0:\n",
      "      Successfully uninstalled requests-2.31.0\n",
      "  Attempting uninstall: pillow\n",
      "    Found existing installation: Pillow 9.0.1\n",
      "    Uninstalling Pillow-9.0.1:\n",
      "      Successfully uninstalled Pillow-9.0.1\n",
      "  Attempting uninstall: huggingface-hub\n",
      "    Found existing installation: huggingface-hub 0.18.0\n",
      "    Uninstalling huggingface-hub-0.18.0:\n",
      "      Successfully uninstalled huggingface-hub-0.18.0\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "gradio 3.50.2 requires pillow<11.0,>=8.0, but you have pillow 11.0.0 which is incompatible.\n",
      "imageio 2.31.6 requires pillow<10.1.0,>=8.3.2, but you have pillow 11.0.0 which is incompatible.\n",
      "mindtorch 0.3.0 requires tqdm==4.65.0, but you have tqdm 4.67.0 which is incompatible.\u001b[0m\u001b[31m\n",
      "\u001b[0mSuccessfully installed addict-2.4.0 aiohappyeyeballs-2.4.3 aiohttp-3.11.0 aiosignal-1.3.1 async-timeout-5.0.1 datasets-3.1.0 dill-0.3.8 evaluate-0.4.3 frozenlist-1.5.0 huggingface-hub-0.26.2 hypothesis-6.118.8 iniconfig-2.0.0 mindnlp-0.4.0 multidict-6.1.0 multiprocess-0.70.16 pillow-11.0.0 pluggy-1.5.0 propcache-0.2.0 pyarrow-18.0.0 pyctcdecode-0.5.0 pygtrie-2.5.0 pytest-7.2.0 requests-2.32.3 safetensors-0.4.5 sortedcontainers-2.4.0 tokenizers-0.19.1 tomli-2.1.0 tqdm-4.67.0 xxhash-3.5.0 yarl-1.17.1\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0mFound existing installation: soundfile 0.12.1\n",
      "Uninstalling soundfile-0.12.1:\n",
      "  Successfully uninstalled soundfile-0.12.1\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0mLooking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Collecting mindspore==2.3.1\n",
      "  Downloading https://ms-release.obs.cn-north-4.myhuaweicloud.com/2.3.1/MindSpore/unified/aarch64/mindspore-2.3.1-cp39-cp39-linux_aarch64.whl (328.8 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m328.8/328.8 MB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: numpy<2.0.0,>=1.20.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore==2.3.1) (1.23.5)\n",
      "Requirement already satisfied: protobuf>=3.13.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore==2.3.1) (3.20.3)\n",
      "Requirement already satisfied: asttokens>=2.0.4 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore==2.3.1) (2.4.1)\n",
      "Requirement already satisfied: pillow>=6.2.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore==2.3.1) (11.0.0)\n",
      "Requirement already satisfied: scipy>=1.5.4 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore==2.3.1) (1.11.3)\n",
      "Requirement already satisfied: packaging>=20.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore==2.3.1) (23.2)\n",
      "Requirement already satisfied: psutil>=5.6.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore==2.3.1) (5.9.5)\n",
      "Requirement already satisfied: astunparse>=1.6.3 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from mindspore==2.3.1) (1.6.3)\n",
      "Requirement already satisfied: six>=1.12.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from asttokens>=2.0.4->mindspore==2.3.1) (1.16.0)\n",
      "Requirement already satisfied: wheel<1.0,>=0.23.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from astunparse>=1.6.3->mindspore==2.3.1) (0.41.3)\n",
      "\u001b[33mDEPRECATION: moxing-framework 2.1.16.2ae09d45 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of moxing-framework or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n",
      "\u001b[0mInstalling collected packages: mindspore\n",
      "  Attempting uninstall: mindspore\n",
      "    Found existing installation: mindspore 2.3.0\n",
      "    Uninstalling mindspore-2.3.0:\n",
      "      Successfully uninstalled mindspore-2.3.0\n",
      "Successfully installed mindspore-2.3.1\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "#安装环境，mindnlp 0.4.0,minspore 2.3.1\n",
    "!pip install mindnlp==0.4.0\n",
    "!pip uninstall soundfile -y\n",
    "!pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/2.3.1/MindSpore/unified/aarch64/mindspore-2.3.1-cp39-cp39-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://pypi.tuna.tsinghua.edu.cn/simple"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1e40b8a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: http://mirrors.aliyun.com/pypi/simple/\n",
      "Collecting openmind_hub\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/8a/0e/c134b08529c218feab1a792e052d625cfe143ac12baf68c3ee3d17dc2a90/openmind_hub-0.9.0-py3-none-any.whl (124 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.7/124.7 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting filelock>=3.14.0 (from openmind_hub)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl (16 kB)\n",
      "Collecting fsspec>=2024.2.0 (from openmind_hub)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl (179 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.6/179.6 kB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting jinja2>=3.1.4 (from openmind_hub)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/31/80/3a54838c3fb461f6fec263ebf3a3a41771bd05190238de3486aae8540c36/jinja2-3.1.4-py3-none-any.whl (133 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m133.3/133.3 kB\u001b[0m \u001b[31m25.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: pyyaml>=6.0.1 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from openmind_hub) (6.0.1)\n",
      "Collecting requests==2.32.2 (from openmind_hub)\n",
      "  Downloading http://mirrors.aliyun.com/pypi/packages/c3/20/748e38b466e0819491f0ce6e90ebe4184966ee304fe483e2c414b0f4ef07/requests-2.32.2-py3-none-any.whl (63 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.9/63.9 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: tqdm>=4.66.4 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from openmind_hub) (4.67.0)\n",
      "Requirement already satisfied: urllib3>=2.0.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from openmind_hub) (2.0.7)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from requests==2.32.2->openmind_hub) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from requests==2.32.2->openmind_hub) (3.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from requests==2.32.2->openmind_hub) (2023.7.22)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages (from jinja2>=3.1.4->openmind_hub) (2.1.3)\n",
      "\u001b[33mDEPRECATION: moxing-framework 2.1.16.2ae09d45 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of moxing-framework or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n",
      "\u001b[0mInstalling collected packages: requests, jinja2, fsspec, filelock, openmind_hub\n",
      "  Attempting uninstall: requests\n",
      "    Found existing installation: requests 2.32.3\n",
      "    Uninstalling requests-2.32.3:\n",
      "      Successfully uninstalled requests-2.32.3\n",
      "  Attempting uninstall: jinja2\n",
      "    Found existing installation: Jinja2 3.1.2\n",
      "    Uninstalling Jinja2-3.1.2:\n",
      "      Successfully uninstalled Jinja2-3.1.2\n",
      "  Attempting uninstall: fsspec\n",
      "    Found existing installation: fsspec 2023.10.0\n",
      "    Uninstalling fsspec-2023.10.0:\n",
      "      Successfully uninstalled fsspec-2023.10.0\n",
      "  Attempting uninstall: filelock\n",
      "    Found existing installation: filelock 3.13.1\n",
      "    Uninstalling filelock-3.13.1:\n",
      "      Successfully uninstalled filelock-3.13.1\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "datasets 3.1.0 requires fsspec[http]<=2024.9.0,>=2023.1.0, but you have fsspec 2024.10.0 which is incompatible.\n",
      "gradio 3.50.2 requires pillow<11.0,>=8.0, but you have pillow 11.0.0 which is incompatible.\u001b[0m\u001b[31m\n",
      "\u001b[0mSuccessfully installed filelock-3.16.1 fsspec-2024.10.0 jinja2-3.1.4 openmind_hub-0.9.0 requests-2.32.2\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "#安装模型下载工具\n",
    "!pip install openmind_hub"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e02088c1",
   "metadata": {},
   "source": [
    "## Introduction\n",
    "\n",
    "LayoutLMv3 is a pre-trained model for document-understanding tasks. It combines text, layout, and visual information to process documents with complex structure — typically invoices, forms, and other kinds of unstructured text.\n",
    "\n",
    "As a foundational pre-trained model, LayoutLMv3 mainly provides pre-training capability; after fine-tuning it can be applied to a variety of downstream tasks such as document classification, information extraction, and question answering.\n",
    "\n",
    "In practice, a fine-tuned LayoutLMv3 is usually combined with other models or tools — most commonly optical character recognition (OCR) — to extract text and layout information from scanned documents. This combination handles real-world documents effectively and extracts richer information."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6be35a90",
   "metadata": {},
   "source": [
    "## Example overview\n",
    "\n",
    "With OCR models and tools we can obtain the text in a document together with its position information, but OCR alone cannot capture the deeper meaning of that text — this is exactly the capability LayoutLMv3 provides.\n",
    "\n",
    "Typically, OCR is applied first to obtain the text and its positions; that information is then fed into the pre-trained LayoutLMv3 model, which understands its semantics within the document — for example, the category of each text span (a date, an amount, a name, etc.), or the category of the document as a whole."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "07dae0b1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Archive:  data_funsd-layoutlmv3.zip\n",
      "   creating: data_funsd-layoutlmv3/\n",
      "  inflating: data_funsd-layoutlmv3/test_0.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_1.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_10.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_11.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_12.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_13.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_14.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_15.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_16.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_17.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_18.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_19.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_2.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_20.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_21.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_22.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_23.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_24.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_25.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_26.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_27.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_28.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_29.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_3.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_30.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_31.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_32.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_33.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_34.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_35.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_36.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_37.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_38.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_39.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_4.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_40.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_41.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_42.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_43.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_44.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_45.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_46.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_47.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_48.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_49.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_5.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_6.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_7.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_8.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_9.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/test_data.json  \n",
      "  inflating: data_funsd-layoutlmv3/train_0.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_1.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_10.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_100.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_101.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_102.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_103.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_104.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_105.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_106.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_107.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_108.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_109.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_11.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_110.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_111.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_112.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_113.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_114.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_115.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_116.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_117.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_118.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_119.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_12.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_120.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_121.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_122.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_123.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_124.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_125.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_126.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_127.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_128.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_129.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_13.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_130.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_131.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_132.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_133.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_134.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_135.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_136.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_137.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_138.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_139.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_14.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_140.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_141.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_142.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_143.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_144.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_145.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_146.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_147.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_148.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_15.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_16.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_17.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_18.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_19.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_2.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_20.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_21.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_22.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_23.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_24.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_25.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_26.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_27.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_28.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_29.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_3.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_30.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_31.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_32.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_33.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_34.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_35.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_36.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_37.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_38.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_39.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_4.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_40.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_41.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_42.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_43.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_44.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_45.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_46.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_47.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_48.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_49.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_5.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_50.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_51.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_52.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_53.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_54.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_55.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_56.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_57.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_58.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_59.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_6.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_60.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_61.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_62.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_63.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_64.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_65.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_66.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_67.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_68.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_69.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_7.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_70.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_71.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_72.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_73.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_74.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_75.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_76.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_77.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_78.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_79.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_8.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_80.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_81.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_82.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_83.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_84.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_85.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_86.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_87.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_88.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_89.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_9.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_90.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_91.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_92.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_93.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_94.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_95.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_96.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_97.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_98.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_99.jpg  \n",
      "  inflating: data_funsd-layoutlmv3/train_data.json  \n"
     ]
    }
   ],
   "source": [
    "#准备体验数据集，下载data_funsd-layoutlmv3.zip，该数据集包含多个文档图片，且已经通过ocr等技术，标注好了文档中文本的位置等相关信息\n",
    "#该数据集的详细说明，可以参考:https://huggingface.co/datasets/nielsr/funsd-layoutlmv3\n",
    "\n",
    "#解压data_funsd-layoutlmv3.zip\n",
    "!unzip data_funsd-layoutlmv3.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ce3a5cfe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset loading helper\n",
    "import json\n",
    "from PIL import Image\n",
    "\n",
    "def load_dataset(load_path):\n",
    "    \"\"\"Load the FUNSD-style dataset located at ``load_path``.\n",
    "\n",
    "    Expects ``{load_path}/{split}_data.json`` (parallel lists under keys\n",
    "    'id', 'tokens', 'bboxes', 'ner_tags') plus images named\n",
    "    ``{split}_{i}.jpg`` for each split in ('train', 'test').\n",
    "\n",
    "    Returns a dict mapping split name -> list of samples; each sample is a\n",
    "    dict with keys 'id', 'tokens', 'bboxes', 'ner_tags', 'image' (PIL Image).\n",
    "    \"\"\"\n",
    "    data_dict = {}\n",
    "    for split in ('train', 'test'):\n",
    "        with open(f'{load_path}/{split}_data.json', mode='r', encoding='utf8') as f:\n",
    "            json_data = json.load(f)\n",
    "        samples = []\n",
    "        # The per-field lists are parallel; zip them instead of indexing each one.\n",
    "        fields = zip(json_data['id'], json_data['tokens'], json_data['bboxes'], json_data['ner_tags'])\n",
    "        for i, (sample_id, tokens, bboxes, ner_tags) in enumerate(fields):\n",
    "            # Image files are named by position in the list, not by the sample's own id.\n",
    "            image = Image.open(f'{load_path}/{split}_{i}.jpg')\n",
    "            image.load()  # force the pixel read so PIL releases the file handle (avoids fd leak over ~200 files)\n",
    "            samples.append({\"id\": sample_id, \"tokens\": tokens, \"bboxes\": bboxes, \"ner_tags\": ner_tags, \"image\": image})\n",
    "        data_dict[split] = samples\n",
    "    return data_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b556e137",
   "metadata": {},
   "outputs": [],
   "source": [
    "#加载数据集，读取其中的一条数据，用来后面layoutlmv3模型的推理分析\n",
    "load_path = 'data_funsd-layoutlmv3'\n",
    "data_dict = load_dataset(load_path)\n",
    "example = data_dict['train'][0]\n",
    "image = example[\"image\"] #文档图片，是个pil库的image对象\n",
    "words = example[\"tokens\"] #文档里的文本对应的token\n",
    "boxes = example[\"bboxes\"] #每个文本的坐标\n",
    "word_labels = example[\"ner_tags\"] #每个文本的真实标签，可以用来评估layoutlmv3的判断是否准确，输出loss等信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e82485a8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 1.327 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "#导入mindnlp中相关的类\n",
    "import mindspore\n",
    "from mindspore import ops\n",
    "from mindnlp.transformers import AutoProcessor, AutoModelForTokenClassification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3f31eaba",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[WARNING][2024-11-14 13:59:38,740]: You are running openMind Hub Client as an Admin user, which is not recommended.\n",
      "Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]\n",
      ".gitattributes: 100%|██████████| 1.52k/1.52k [00:00<00:00, 122kB/s]\n",
      "Fetching 15 files:   7%|▋         | 1/15 [00:06<01:30,  6.46s/it]\n",
      "merges.txt:   0%|          | 0.00/506k [00:00<?, ?B/s]\u001b[A\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "README.md:   0%|          | 0.00/1.37k [00:00<?, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
      "\n",
      "\n",
      "\n",
      "preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\n",
      "\n",
      "\n",
      "config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]\u001b[A\u001b[A\u001b[A\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "all_results.json:   0%|          | 0.00/519 [00:00<?, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
      "\n",
      "preprocessor_config.json: 100%|██████████| 275/275 [00:00<00:00, 4.02kB/s]\n",
      "README.md: 100%|██████████| 1.37k/1.37k [00:00<00:00, 15.7kB/s]\n",
      "config.json: 100%|██████████| 1.38k/1.38k [00:00<00:00, 23.2kB/s]\n",
      "all_results.json: 100%|██████████| 519/519 [00:00<00:00, 11.0kB/s]\n",
      "eval_results.json: 100%|██████████| 320/320 [00:00<00:00, 10.3kB/s]\n",
      "Fetching 15 files:  20%|██        | 3/15 [00:13<00:54,  4.52s/it]\n",
      "merges.txt: 100%|██████████| 506k/506k [00:09<00:00, 51.1kB/s]it]\n",
      "\n",
      "\n",
      "pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]\u001b[A\u001b[A\n",
      "special_tokens_map.json: 100%|██████████| 772/772 [00:00<00:00, 94.2kB/s]\n",
      "\n",
      "train_results.json:   0%|          | 0.00/202 [00:00<?, ?B/s]\u001b[A\n",
      "train_results.json: 100%|██████████| 202/202 [00:00<00:00, 287B/s]\u001b[A\n",
      "\n",
      "tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]\u001b[A\n",
      "\n",
      "\n",
      "trainer_state.json:   0%|          | 0.00/19.0k [00:00<?, ?B/s]\u001b[A\u001b[A\u001b[A\n",
      "\n",
      "\n",
      "\n",
      "trainer_state.json: 100%|██████████| 19.0k/19.0k [00:00<00:00, 663kB/s]\n",
      "tokenizer_config.json: 100%|██████████| 1.18k/1.18k [00:00<00:00, 21.0kB/s]\n",
      "\n",
      "\n",
      "pytorch_model.bin:   2%|▏         | 10.5M/501M [00:03<02:24, 3.39MB/s]\u001b[A\u001b[A\n",
      "vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]\u001b[A\n",
      "\n",
      "pytorch_model.bin:   4%|▍         | 21.0M/501M [00:03<01:14, 6.48MB/s]\u001b[A\u001b[A\n",
      "vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 1.22MB/s]\u001b[A\n",
      "\n",
      "\n",
      "pytorch_model.bin:   6%|▋         | 31.5M/501M [00:04<00:47, 9.88MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:   8%|▊         | 41.9M/501M [00:04<00:35, 13.1MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  10%|█         | 52.4M/501M [00:04<00:27, 16.1MB/s]\u001b[A\u001b[A\n",
      "\n",
      "\n",
      "\n",
      "tokenizer.json: 100%|██████████| 1.36M/1.36M [00:02<00:00, 650kB/s]\u001b[A\u001b[A\u001b[A\u001b[A\n",
      "\n",
      "\n",
      "pytorch_model.bin:  13%|█▎        | 62.9M/501M [00:05<00:23, 18.6MB/s]\u001b[A\u001b[A\n",
      "training_args.bin: 100%|██████████| 2.93k/2.93k [00:00<00:00, 112kB/s]\n",
      "\n",
      "\n",
      "pytorch_model.bin:  15%|█▍        | 73.4M/501M [00:05<00:20, 20.5MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  17%|█▋        | 83.9M/501M [00:06<00:19, 21.2MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  19%|█▉        | 94.4M/501M [00:06<00:17, 22.7MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  21%|██        | 105M/501M [00:06<00:16, 23.9MB/s] \u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  23%|██▎       | 115M/501M [00:07<00:15, 24.7MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  25%|██▌       | 126M/501M [00:07<00:14, 25.2MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  27%|██▋       | 136M/501M [00:08<00:14, 25.8MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  29%|██▉       | 147M/501M [00:08<00:13, 26.2MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  31%|███▏      | 157M/501M [00:08<00:12, 26.5MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  33%|███▎      | 168M/501M [00:09<00:12, 26.6MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  36%|███▌      | 178M/501M [00:09<00:12, 25.7MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  38%|███▊      | 189M/501M [00:10<00:12, 26.0MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  40%|███▉      | 199M/501M [00:10<00:11, 26.7MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  42%|████▏     | 210M/501M [00:10<00:10, 26.8MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  44%|████▍     | 220M/501M [00:11<00:10, 26.9MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  46%|████▌     | 231M/501M [00:11<00:10, 27.0MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  48%|████▊     | 241M/501M [00:12<00:10, 25.9MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  50%|█████     | 252M/501M [00:12<00:09, 25.8MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  52%|█████▏    | 262M/501M [00:12<00:09, 26.1MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  54%|█████▍    | 273M/501M [00:13<00:08, 26.3MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  56%|█████▋    | 283M/501M [00:13<00:08, 26.4MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  59%|█████▊    | 294M/501M [00:14<00:07, 26.5MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  61%|██████    | 304M/501M [00:14<00:07, 26.6MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  63%|██████▎   | 315M/501M [00:14<00:07, 26.6MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  65%|██████▍   | 325M/501M [00:15<00:06, 26.5MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  67%|██████▋   | 336M/501M [00:15<00:06, 26.4MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  69%|██████▉   | 346M/501M [00:15<00:05, 26.5MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  71%|███████   | 357M/501M [00:16<00:05, 24.8MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  73%|███████▎  | 367M/501M [00:16<00:05, 25.2MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  75%|███████▌  | 377M/501M [00:17<00:04, 25.3MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  77%|███████▋  | 388M/501M [00:17<00:04, 25.7MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  79%|███████▉  | 398M/501M [00:18<00:03, 26.1MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  82%|████████▏ | 409M/501M [00:18<00:03, 26.2MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  84%|████████▎ | 419M/501M [00:18<00:03, 26.4MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  86%|████████▌ | 430M/501M [00:19<00:02, 26.4MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  88%|████████▊ | 440M/501M [00:19<00:02, 26.5MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  90%|████████▉ | 451M/501M [00:20<00:01, 26.6MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  92%|█████████▏| 461M/501M [00:20<00:01, 26.5MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  94%|█████████▍| 472M/501M [00:20<00:01, 26.7MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  96%|█████████▌| 482M/501M [00:21<00:00, 25.7MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin:  98%|█████████▊| 493M/501M [00:21<00:00, 25.7MB/s]\u001b[A\u001b[A\n",
      "\n",
      "pytorch_model.bin: 100%|██████████| 501M/501M [00:22<00:00, 22.8MB/s]\u001b[A\u001b[A\n",
      "Fetching 15 files: 100%|██████████| 15/15 [00:40<00:00,  2.69s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'/tmp/code/layoutlmv3/layoutlmv3-base-finetuned-funsd'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#在魔乐社区下载模型文件\n",
    "from openmind_hub import snapshot_download\n",
    "snapshot_download(repo_id=\"wuyiqun/layoutlmv3-base-finetuned-funsd\", local_dir='./layoutlmv3-base-finetuned-funsd/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b25d7251",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/transformers/tokenization_utils_base.py:1526: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted, and will be then set to `False` by default. \n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[MS_ALLOC_CONF]Runtime config:  enable_vmm:True  vmm_align_size:2MB\n",
      "<class 'mindnlp.transformers.models.layoutlmv3.modeling_layoutlmv3.LayoutLMv3ForTokenClassification'>\n"
     ]
    }
   ],
   "source": [
    "#运行文本token分类的例子\n",
    "\n",
    "#通过加载layoutlmv3-base-finetuned-funsd模型文件，创建文本token的分类模型，因为当前数据集中的文本类型有7个，所以num_labels=7\n",
    "processor = AutoProcessor.from_pretrained(\"./layoutlmv3-base-finetuned-funsd\", apply_ocr=False)\n",
    "model = AutoModelForTokenClassification.from_pretrained(\"./layoutlmv3-base-finetuned-funsd\", num_labels=7)\n",
    "print(type(model))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fedf2225",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loss:0.00053317123\n",
      "logits.shape:(1, 208, 7)\n",
      "all_classid:[[6 0 0 0 3 3 4 4 3 4 5 6 3 3 0 0 0 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n",
      "  3 4 4 4 4 4 5 6 6 6 6 6 6 6 6 6 6 5 6 6 6 6 3 4 4 4 4 5 6 6 6 6 6 6 3 4\n",
      "  4 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6\n",
      "  6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 3 4 4 4 4\n",
      "  4 4 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 2 2\n",
      "  2 2 2 2 2 3 4 4 5 6 6 6 6 6 6 6 5 6 6 6 6 6 0 0 0 0 0 6]]\n"
     ]
    }
   ],
   "source": [
    "#将从数据集获取的一张图片，以及对应的文本token和坐标，经过处理，传入模型，这里还传入了word_labels，这是数据集中标注好的每个文本token的类别，以此用来计算模型输出的loss\n",
    "#模型输出的logits.shape为(1, 208, 7)，其中208是该图片中有208个文本，7对应上面的类别数\n",
    "encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors=\"ms\")\n",
    "\n",
    "outputs = model(**encoding)\n",
    "loss = outputs.loss\n",
    "logits = outputs.logits\n",
    "print(f'loss:{loss}')\n",
    "print(f'logits.shape:{logits.shape}')\n",
    "all_classid = ops.argmax(logits, dim=2)\n",
    "print(f'all_classid:{all_classid}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5b26597",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
